Import Library¶

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker

import seaborn as sns
from scipy import stats
import plotly.express as px
import folium

# from keras.models import Sequential
# from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

Load the dataset¶

In [2]:
data = pd.read_csv('Measurement_summary.csv')
data_info = pd.read_csv('Measurement_item_info.csv')

Show the first few rows of the dataset¶

In [3]:
data.head()
Out[3]:
Measurement date Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 PM2.5
0 1/1/2017 0:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.004 0.059 0.002 1.2 73 57
1 1/1/2017 1:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.004 0.058 0.002 1.2 71 59
2 1/1/2017 2:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.004 0.056 0.002 1.2 70 59
3 1/1/2017 3:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.004 0.056 0.002 1.2 70 58
4 1/1/2017 4:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.003 0.051 0.002 1.2 69 61
In [4]:
data_info.head()
Out[4]:
Item code Item name Unit of measurement Good(Blue) Normal(Green) Bad(Yellow) Very bad(Red)
0 1 SO2 ppm 0.02 0.05 0.15 1.0
1 3 NO2 ppm 0.03 0.06 0.20 2.0
2 5 CO ppm 2.00 9.00 15.00 50.0
3 6 O3 ppm 0.03 0.09 0.15 0.5
4 8 PM10 Mircrogram/m3 30.00 80.00 150.00 600.0

Create Categorization¶

In [5]:
# Create dictionary from the provided information
conditions = {
    'SO2': {'Good': (0, 0.02), 'Normal': (0.02, 0.05), 'Bad': (0.05, 0.15), 'Very Bad': (0.15, 1)},
    'NO2': {'Good': (0, 0.03), 'Normal': (0.03, 0.06), 'Bad': (0.06, 0.2), 'Very Bad': (0.2, 2)},
    'CO': {'Good': (0, 2), 'Normal': (2, 9), 'Bad': (9, 15), 'Very Bad': (15, 50)},
    'O3': {'Good': (0, 0.03), 'Normal': (0.03, 0.09), 'Bad': (0.09, 0.15), 'Very Bad': (0.15, 0.5)},
    'PM10': {'Good': (0, 30), 'Normal': (30, 80), 'Bad': (80, 150), 'Very Bad': (150, 600)},
    'PM2.5': {'Good': (0, 15), 'Normal': (15, 35), 'Bad': (35, 75), 'Very Bad': (75, 500)},
}

# Function to categorize pollutant levels
def categorize(level, condition):
    if condition['Good'][0] <= level < condition['Good'][1]:
        return 'Good'
    elif condition['Normal'][0] <= level < condition['Normal'][1]:
        return 'Normal'
    elif condition['Bad'][0] <= level < condition['Bad'][1]:
        return 'Bad'
    else:
        return 'Very Bad'

# Apply the function to each pollutant
for pollutant in conditions.keys():
    data[f'{pollutant}_Category'] = data[pollutant].apply(lambda x: categorize(x, conditions[pollutant]))
In [6]:
data.sample(5)
Out[6]:
Measurement date Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
32391 9/28/2017 6:00 102 Jung-gu 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... 37.564263 126.974676 0.003 0.027 0.005 0.4 11 7 Good Good Good Good Good Good
503046 4/2/2018 8:00 120 Dongjak-gu 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... 37.480917 126.971481 0.004 0.031 0.029 0.4 59 27 Good Normal Good Good Normal Normal
191609 3/6/2018 17:00 108 Gwangjin-gu 571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi... 37.547180 127.092493 0.005 0.030 0.028 0.5 37 19 Good Normal Good Good Normal Normal
158929 5/27/2017 20:00 107 Seongdong-gu 18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re... 37.541864 127.049659 0.004 0.041 0.028 0.2 91 21 Good Normal Good Good Bad Normal
706 1/30/2017 10:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.004 0.010 0.033 0.3 35 30 Good Good Good Normal Normal Normal
In [7]:
# data.to_csv("Measurement_summary_fix.csv")

Create Categorization to Numeric¶

In [8]:
# Create a dictionary to map categories to numbers
category_to_numeric = {'Good': 1, 'Normal': 2, 'Bad': 3, 'Very Bad': 4}

# Changing categories to be numeric for each category column
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    data[f'{pollutant}_Category'] = data[f'{pollutant}_Category'].map(category_to_numeric)
In [9]:
data.sample(5)
Out[9]:
Measurement date Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
361989 12/6/2019 19:00 114 Nowon-gu 17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ... 37.658774 127.068505 0.006 0.045 0.004 0.8 26 20 1 2 1 1 1 2
540517 8/9/2019 20:00 121 Gwanak-gu 14, Sillimdong-gil, Gwanak-gu, Seoul, Republic... 37.487355 126.927102 0.004 0.026 0.010 0.2 19 10 1 1 1 1 1 1
620338 11/8/2019 15:00 124 Songpa-gu 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... 37.502686 127.092509 0.002 0.034 0.021 0.4 29 12 1 2 1 1 1 1
415312 2/7/2017 20:00 117 Guro-gu 45, Gamasan-ro 27-gil, Guro-gu, Seoul, Republi... 37.498498 126.889692 0.007 0.037 0.003 0.6 54 21 1 2 1 1 2 2
13846 7/31/2018 22:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 0.003 0.017 0.037 0.4 27 18 1 1 1 2 1 2

Reformating¶

In [10]:
# Convert 'Measurement date' to datetime
data['Measurement date'] = pd.to_datetime(data['Measurement date'])

# Check the data types to confirm the conversion
data.dtypes
Out[10]:
Measurement date          datetime64[ns]
Station code                       int64
Station name(district)            object
Address                           object
Latitude                         float64
Longitude                        float64
SO2                              float64
NO2                              float64
O3                               float64
CO                               float64
PM10                               int64
PM2.5                              int64
SO2_Category                       int64
NO2_Category                       int64
CO_Category                        int64
O3_Category                        int64
PM10_Category                      int64
PM2.5_Category                     int64
dtype: object
In [11]:
data.sample(5)
Out[11]:
Measurement date Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
180803 2019-12-11 11:00:00 107 Seongdong-gu 18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re... 37.541864 127.049659 0.005 0.023 0.032 0.7 112 66 1 1 1 2 3 3
567014 2019-09-03 16:00:00 122 Seocho-gu 16, Sinbanpo-ro 15-gil, Seocho-gu, Seoul, Repu... 37.504547 126.994458 0.002 0.018 0.037 0.2 12 10 1 1 1 2 1 1
510140 2019-01-22 23:00:00 120 Dongjak-gu 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... 37.480917 126.971481 0.005 0.041 0.017 0.7 76 50 1 2 1 1 2 3
413422 2019-11-20 14:00:00 116 Gangseo-gu 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... 37.544640 126.835151 0.004 0.014 0.030 0.3 16 10 1 1 1 2 1 1
595938 2017-01-10 20:00:00 124 Songpa-gu 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... 37.502686 127.092509 0.005 0.017 0.025 0.2 23 7 1 1 1 1 1 1
In [12]:
# Melt the pollutants and their categories into single columns
data_melted = data.melt(id_vars=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude'],
                        value_vars=['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5'],
                        var_name='Air Pollutants', value_name='Measurement Value')

data_melted_categories = data.melt(id_vars=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude'],
                                   value_vars=['SO2_Category', 'NO2_Category', 'CO_Category', 'O3_Category', 'PM10_Category', 'PM2.5_Category'],
                                   var_name='Air Pollutants', value_name='Quality')
In [13]:
data_melted.sample(5)
Out[13]:
Measurement date Station code Station name(district) Address Latitude Longitude Air Pollutants Measurement Value
45697 2019-04-14 23:00:00 102 Jung-gu 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... 37.564263 126.974676 SO2 0.002
3749308 2019-04-09 10:00:00 120 Dongjak-gu 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... 37.480917 126.971481 PM2.5 11.000
1424687 2017-01-07 21:00:00 106 Mapo-gu 10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o... 37.555580 126.905597 CO 1.100
2899056 2019-10-18 03:00:00 112 Gangbuk-gu 49, Samyang-ro 139-gil, Gangbuk-gu, Seoul, Rep... 37.647930 127.011952 PM10 18.000
2383615 2017-02-02 18:00:00 118 Geumcheon-gu 20, Geumha-ro 21-gil, Geumcheon-gu, Seoul, Rep... 37.452357 126.908296 O3 0.010
In [14]:
data_melted_categories.head(5)
Out[14]:
Measurement date Station code Station name(district) Address Latitude Longitude Air Pollutants Quality
0 2017-01-01 00:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2_Category 1
1 2017-01-01 01:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2_Category 1
2 2017-01-01 02:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2_Category 1
3 2017-01-01 03:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2_Category 1
4 2017-01-01 04:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2_Category 1
In [15]:
# Remove the "_Category" part from the 'Air Pollutants' column in the categories dataframe
data_melted_categories['Air Pollutants'] = data_melted_categories['Air Pollutants'].str.replace('_Category', '')
In [16]:
# Merge the two dataframes on the common columns
data_merged = pd.merge(data_melted, data_melted_categories,
                       on=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude', 'Air Pollutants'])
In [17]:
data_merged.head(5)
Out[17]:
Measurement date Station code Station name(district) Address Latitude Longitude Air Pollutants Measurement Value Quality
0 2017-01-01 00:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2 0.004 1
1 2017-01-01 01:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2 0.004 1
2 2017-01-01 02:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2 0.004 1
3 2017-01-01 03:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2 0.004 1
4 2017-01-01 04:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 SO2 0.003 1
In [18]:
# data.to_csv("Measurement_summary_fix_pisan.csv")

Exploratory Data Analysis¶

Summary statistics¶

In [19]:
summary = data.describe()
In [20]:
summary
Out[20]:
Station code Latitude Longitude SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
count 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000 647511.000000
mean 113.000221 37.553484 126.989340 -0.001795 0.022519 0.017979 0.509197 43.708051 25.411995 1.020259 1.463720 1.020129 1.355406 1.723722 1.894979
std 7.211315 0.053273 0.078790 0.078832 0.115153 0.099308 0.405319 71.137342 43.924595 0.241154 0.615827 0.240415 0.530487 0.681677 0.800266
min 101.000000 37.452357 126.835151 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
25% 107.000000 37.517528 126.927102 0.003000 0.016000 0.008000 0.300000 22.000000 11.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
50% 113.000000 37.544962 127.004850 0.004000 0.025000 0.021000 0.500000 35.000000 19.000000 1.000000 1.000000 1.000000 1.000000 2.000000 2.000000
75% 119.000000 37.584848 127.047470 0.005000 0.038000 0.034000 0.600000 53.000000 31.000000 1.000000 2.000000 1.000000 2.000000 2.000000 2.000000
max 125.000000 37.658774 127.136792 3.736000 38.445000 33.600000 71.700000 3586.000000 6256.000000 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000

Check for missing values¶

In [21]:
missing_values = data.isnull().sum()
In [22]:
missing_values
Out[22]:
Measurement date          0
Station code              0
Station name(district)    0
Address                   0
Latitude                  0
Longitude                 0
SO2                       0
NO2                       0
O3                        0
CO                        0
PM10                      0
PM2.5                     0
SO2_Category              0
NO2_Category              0
CO_Category               0
O3_Category               0
PM10_Category             0
PM2.5_Category            0
dtype: int64

Histograms of numerical columns¶

In [23]:
numerical_columns = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
data[numerical_columns].hist(bins=30, figsize=(10, 10), layout=(3, 2));
In [24]:
numerical_columns_category = ['SO2_Category', 'NO2_Category', 'O3_Category', 'CO_Category', 'PM10_Category', 'PM2.5_Category']
data[numerical_columns_category].hist(bins=30, figsize=(10, 10), layout=(3, 2));
In [25]:
numerical_columns_merge = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5','SO2_Category', 'NO2_Category', 'O3_Category', 'CO_Category', 'PM10_Category', 'PM2.5_Category']

Outlier Detection¶

In [26]:
# Visualize the data using boxplots to identify any outliers
fig, axes = plt.subplots(3, 2, figsize=(15, 10))

for ax, pollutant in zip(axes.flatten(), numerical_columns):
    sns.boxplot(data=data, x=pollutant, ax=ax)
    ax.set_title(f'Boxplot of {pollutant}')

plt.tight_layout()

Correlation Matrix¶

Correlation Matrix Pollutants¶

In [27]:
correlation_matrix = data[numerical_columns].corr()
In [28]:
correlation_matrix
Out[28]:
SO2 NO2 O3 CO PM10 PM2.5
SO2 1.000000 0.712422 0.805551 0.304923 0.048573 0.047531
NO2 0.712422 1.000000 0.785805 0.245746 0.055532 0.057844
O3 0.805551 0.785805 1.000000 0.188998 0.038602 0.033868
CO 0.304923 0.245746 0.188998 1.000000 0.151166 0.182867
PM10 0.048573 0.055532 0.038602 0.151166 1.000000 0.228984
PM2.5 0.047531 0.057844 0.033868 0.182867 0.228984 1.000000
In [29]:
# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants')
plt.show()

Corelation Matrix Category¶

In [30]:
correlation_matrix_category = data[numerical_columns_category].corr()
In [31]:
correlation_matrix_category
Out[31]:
SO2_Category NO2_Category O3_Category CO_Category PM10_Category PM2.5_Category
SO2_Category 1.000000 0.298820 0.367557 0.929548 0.240768 0.190424
NO2_Category 0.298820 1.000000 -0.176201 0.299551 0.418563 0.459237
O3_Category 0.367557 -0.176201 1.000000 0.364387 0.129657 0.083108
CO_Category 0.929548 0.299551 0.364387 1.000000 0.238802 0.187704
PM10_Category 0.240768 0.418563 0.129657 0.238802 1.000000 0.714912
PM2.5_Category 0.190424 0.459237 0.083108 0.187704 0.714912 1.000000
In [32]:
# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_category, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants Category')
plt.show()

Correlation Matrix Merge¶

In [33]:
correlation_matrix_merge = data[numerical_columns_merge].corr()
In [34]:
correlation_matrix_merge
Out[34]:
SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category O3_Category CO_Category PM10_Category PM2.5_Category
SO2 1.000000 0.712422 0.805551 0.304923 0.048573 0.047531 -0.956390 -0.300650 -0.370886 -0.918129 -0.231661 -0.181064
NO2 0.712422 1.000000 0.785805 0.245746 0.055532 0.057844 -0.629877 -0.092411 -0.300536 -0.648468 -0.111381 -0.066652
O3 0.805551 0.785805 1.000000 0.188998 0.038602 0.033868 -0.729566 -0.325459 -0.166421 -0.749754 -0.188002 -0.153828
CO 0.304923 0.245746 0.188998 1.000000 0.151166 0.182867 -0.172866 0.252376 -0.237336 -0.173522 0.208530 0.270182
PM10 0.048573 0.055532 0.038602 0.151166 1.000000 0.228984 -0.034102 0.135400 -0.001572 -0.038565 0.451615 0.296372
PM2.5 0.047531 0.057844 0.033868 0.182867 0.228984 1.000000 -0.028135 0.158530 -0.012344 -0.032738 0.308582 0.460219
SO2_Category -0.956390 -0.629877 -0.729566 -0.172866 -0.034102 -0.028135 1.000000 0.298820 0.367557 0.929548 0.240768 0.190424
NO2_Category -0.300650 -0.092411 -0.325459 0.252376 0.135400 0.158530 0.298820 1.000000 -0.176201 0.299551 0.418563 0.459237
O3_Category -0.370886 -0.300536 -0.166421 -0.237336 -0.001572 -0.012344 0.367557 -0.176201 1.000000 0.364387 0.129657 0.083108
CO_Category -0.918129 -0.648468 -0.749754 -0.173522 -0.038565 -0.032738 0.929548 0.299551 0.364387 1.000000 0.238802 0.187704
PM10_Category -0.231661 -0.111381 -0.188002 0.208530 0.451615 0.308582 0.240768 0.418563 0.129657 0.238802 1.000000 0.714912
PM2.5_Category -0.181064 -0.066652 -0.153828 0.270182 0.296372 0.460219 0.190424 0.459237 0.083108 0.187704 0.714912 1.000000
In [35]:
# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_merge, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants with Category')
plt.show()

Reformating Negative Value and Null Values¶

In [36]:
# Calculate mean of the positive values for each pollutant
means = {pollutant: data[data[pollutant] >= 0][pollutant].mean() for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']}

# Replace negative values with the mean
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    data[pollutant] = data[pollutant].mask(data[pollutant] < 0).fillna(means[pollutant])

Normalisasi dan Standarisasi¶

In [37]:
def normalize_data(data):
    scaler = MinMaxScaler()
    data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
    return data_normalized

def standardize_data(data):
    scaler = StandardScaler()
    data_standardized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
    return data_standardized

Trend Analysis¶

In [38]:
# Create 'Yearly' column
data['Year'] = data['Measurement date'].dt.year

# Create 'Month' column
data['Month'] = data['Measurement date'].dt.month

# Create 'Quarter' column
data['Quarter'] = data['Measurement date'].dt.quarter

# Extract hour from 'Measurement date'
data['Hour'] = data['Measurement date'].dt.hour

# Calculate the yearly average level of each pollutant
average_pollutant_per_year = data.groupby('Year')[numerical_columns].mean()
average_pollutant_per_year
Out[38]:
SO2 NO2 O3 CO PM10 PM2.5
Year
2017 0.004627 0.029350 0.024844 0.520167 44.455520 24.613483
2018 0.004434 0.028471 0.023495 0.502713 41.982476 24.262548
2019 0.004042 0.027979 0.024883 0.533763 45.581123 27.952126
In [39]:
# Apply the function to each pollutant
for pollutant in conditions.keys():
    average_pollutant_per_year[f'{pollutant}_Category'] = average_pollutant_per_year[pollutant].apply(lambda x: categorize(x, conditions[pollutant]))
In [40]:
average_pollutant_per_year
Out[40]:
SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
Year
2017 0.004627 0.029350 0.024844 0.520167 44.455520 24.613483 Good Good Good Good Normal Normal
2018 0.004434 0.028471 0.023495 0.502713 41.982476 24.262548 Good Good Good Good Normal Normal
2019 0.004042 0.027979 0.024883 0.533763 45.581123 27.952126 Good Good Good Good Normal Normal
In [41]:
# Plot the average pollutant levels per year
plt.figure(figsize=(14, 8))
for pollutant in numerical_columns:
    plt.plot(average_pollutant_per_year.index, average_pollutant_per_year[pollutant], label=pollutant)
plt.xlabel('Year')
plt.ylabel('Average Level')
plt.title('Yearly Average Level of Each Pollutant')
plt.legend()
plt.grid(True)
plt.show()

Time-Series Analysis¶

In [42]:
# Set 'Measurement date' as index
data.set_index('Measurement date', inplace=True)

# Plot the time-series data for each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    data[pollutant].plot(figsize=(10, 5))
    plt.title(f'Time-Series Plot of {pollutant} Levels')
    plt.ylabel('Level')
    plt.show()

Yearly¶

In [43]:
# Calculate yearly averages and convert 'Year' to integer
data_yearly = data.groupby('Year')[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()
data_yearly['Year'] = data_yearly['Year'].astype(int)

# Set 'Year' as index again
data_yearly.set_index('Year', inplace=True)

# Normalize and standardize data
data_normalized_yearly = normalize_data(data_yearly)
data_standardized_yearly = standardize_data(data_yearly)

# Time-series plot for each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    fig, ax = plt.subplots(figsize=(10, 5))

    ax.plot(data_yearly.index, data_yearly[pollutant], label='Original')
    ax.plot(data_normalized_yearly.index, data_normalized_yearly[pollutant], label='Normalized')
    ax.plot(data_standardized_yearly.index, data_standardized_yearly[pollutant], label='Standardized')

    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%d'))
    ax.set_xticks(data_yearly.index)
    ax.set_xticklabels(data_yearly.index)

    ax.set_xlabel('Year')

    plt.title(f'Yearly Time-Series Plot of {pollutant} Levels')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
In [44]:
data_last_year = data[data['Year'] == data['Year'].max()]

# Time-series plot for each pollutant
plt.figure(figsize=(18, 12))

for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_year.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')

plt.tight_layout()
plt.show()
In [45]:
data_last_year.sample(5)
Out[45]:
Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 ... SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category Year Month Quarter Hour
Measurement date
2019-07-23 22:00:00 110 Jungnang-gu 369, Yongmasan-ro, Jungnang-gu, Seoul, Republi... 37.584848 127.094023 0.007 0.014 0.010 0.2 15.0 ... 1 1 1 1 1 1 2019 7 3 22
2019-05-09 13:00:00 103 Yongsan-gu 136, Hannam-daero, Yongsan-gu, Seoul, Republic... 37.540033 127.004850 0.005 0.036 0.051 0.4 42.0 ... 1 2 1 2 2 2 2019 5 2 13
2019-09-10 23:00:00 123 Gangnam-gu 426, Hakdong-ro, Gangnam-gu, Seoul, Republic o... 37.517528 127.047470 0.002 0.013 0.072 0.3 33.0 ... 1 1 1 2 2 2 2019 9 3 23
2019-10-13 05:00:00 113 Dobong-gu 34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub... 37.654192 127.029088 0.002 0.013 0.016 0.4 17.0 ... 1 1 1 1 1 1 2019 10 4 5
2019-06-25 03:00:00 116 Gangseo-gu 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... 37.544640 126.835151 0.004 0.018 0.035 0.4 30.0 ... 1 1 1 2 2 2 2019 6 2 3

5 rows × 21 columns

Monhtly¶

In [46]:
# Calculate monthly averages for each year
data_monthly = data.groupby(['Year', 'Month'])[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()

# Normalize and standardize data
data_normalized_monthly = normalize_data(data_monthly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
data_standardized_monthly = standardize_data(data_monthly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])

# Time-series plot for each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))

    for year in data_monthly['Year'].unique():
        monthly_data_for_year = data_monthly[data_monthly['Year'] == year]
        normalized_data_for_year = data_normalized_monthly[data_monthly['Year'] == year]
        standardized_data_for_year = data_standardized_monthly[data_monthly['Year'] == year]

        plt.plot(monthly_data_for_year['Month'], monthly_data_for_year[pollutant], label=f'Original {year}')
        # plt.plot(normalized_data_for_year['Month'], normalized_data_for_year[pollutant], label=f'Normalized {year}')
        # plt.plot(standardized_data_for_year['Month'], standardized_data_for_year[pollutant], label=f'Standardized {year}')

    plt.title(f'Monthly Time-Series Plot of {pollutant} Levels')
    plt.xlabel('Month')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
In [47]:
data_last_month = data[data['Month'] == data['Month'].max()]

# Time-series plot for each pollutant
plt.figure(figsize=(18, 12))

for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_month.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')

plt.tight_layout()
plt.show()
In [48]:
data_last_month.sample(5)
Out[48]:
Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 ... SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category Year Month Quarter Hour
Measurement date
2017-12-06 09:00:00 121 Gwanak-gu 14, Sillimdong-gil, Gwanak-gu, Seoul, Republic... 37.487355 126.927102 0.006 0.040 0.005 0.6 30.0 ... 1 2 1 1 2 2 2017 12 4 9
2018-12-19 21:00:00 124 Songpa-gu 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... 37.502686 127.092509 0.005 0.069 0.002 1.0 68.0 ... 1 3 1 1 2 3 2018 12 4 21
2017-12-23 10:00:00 108 Gwangjin-gu 571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi... 37.547180 127.092493 0.006 0.067 0.003 1.0 102.0 ... 1 3 1 1 3 3 2017 12 4 10
2018-12-14 10:00:00 105 Seodaemun-gu 32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,... 37.593742 126.949679 0.007 0.019 0.015 0.5 34.0 ... 1 1 1 1 2 2 2018 12 4 10
2019-12-22 09:00:00 106 Mapo-gu 10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o... 37.555580 126.905597 0.004 0.041 0.003 0.9 50.0 ... 1 2 1 1 2 3 2019 12 4 9

5 rows × 21 columns

Quater¶

In [49]:
# Calculate quarterly averages
data_quarterly = data.groupby(['Year','Quarter'])[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()

# Normalize and standardize data
data_normalized_quarterly = normalize_data(data_quarterly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
data_standardized_quarterly = standardize_data(data_quarterly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])

# Time-series plot for each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))

    for year in data_quarterly['Year'].unique():
        quarterly_data_for_year = data_quarterly[data_quarterly['Year'] == year]
        quarterly_normalized_data_for_year = data_normalized_quarterly[data_quarterly['Year'] == year]
        quarterly_standardized_data_for_year = data_standardized_quarterly[data_quarterly['Year'] == year]

        plt.plot(quarterly_data_for_year['Quarter'], quarterly_data_for_year[pollutant], label=f'Original {year}')
        # plt.plot(quarterly_normalized_data_for_year['Quarter'], quarterly_normalized_data_for_year[pollutant], label=f'Normalized {year}')
        # plt.plot(quarterly_standardized_data_for_year['Quarter'], quarterly_standardized_data_for_year[pollutant], label=f'Standardized {year}')

    plt.title(f'Quarterly Time-Series Plot of {pollutant} Levels')
    plt.xlabel('Quarterly')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
In [50]:
data_last_quaeter = data[data['Quarter'] == data['Quarter'].max()]

# Time-series plot for each pollutant
plt.figure(figsize=(18, 12))

for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_quaeter.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')

plt.tight_layout()
plt.show()
In [51]:
data_last_quaeter.sample(5)
Out[51]:
Station code Station name(district) Address Latitude Longitude SO2 NO2 O3 CO PM10 ... SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category Year Month Quarter Hour
Measurement date
2018-11-19 11:00:00 109 Dongdaemun-gu 43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul... 37.575743 127.028885 0.010000 0.071000 0.0060 1.100000 74.000000 ... 1 3 1 1 2 3 2018 11 4 11
2017-12-18 03:00:00 116 Gangseo-gu 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... 37.544640 126.835151 0.004372 0.028609 0.0244 0.518663 43.983296 ... 4 4 4 4 4 4 2017 12 4 3
2017-12-13 18:00:00 102 Jung-gu 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... 37.564263 126.974676 0.003000 0.020000 0.0220 0.400000 28.000000 ... 1 1 1 1 1 2 2017 12 4 18
2018-12-26 22:00:00 116 Gangseo-gu 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... 37.544640 126.835151 0.005000 0.015000 0.0250 0.400000 28.000000 ... 1 1 1 1 1 1 2018 12 4 22
2018-11-03 01:00:00 112 Gangbuk-gu 49, Samyang-ro 139-gil, Gangbuk-gu, Seoul, Rep... 37.647930 127.011952 0.002000 0.041000 0.0060 0.600000 26.000000 ... 1 2 1 1 1 2 2018 11 4 1

5 rows × 21 columns

Seasonality Analysis¶

In [52]:
# Calculate the monthly average level of each pollutant
average_pollutant_per_month = data.groupby('Month')[numerical_columns].mean()

# Plotting the monthly average level of each pollutant
fig, axes = plt.subplots(3, 2, figsize=(18, 12))

for ax, pollutant in zip(axes.flatten(), numerical_columns):
    average_pollutant_per_month[pollutant].plot(ax=ax)
    ax.set_title(f'Monthly Average {pollutant} Levels')
    ax.set_xlabel('Month')
    ax.set_ylabel('Average Level')

plt.tight_layout()
plt.show()

average_pollutant_per_month
Out[52]:
SO2 NO2 O3 CO PM10 PM2.5
Month
1 0.005217 0.036096 0.013289 0.696928 59.507841 35.195308
2 0.005122 0.034629 0.018146 0.643196 52.947707 32.479834
3 0.005072 0.037306 0.027614 0.590821 61.714911 40.929212
4 0.004398 0.029975 0.031333 0.479555 50.543532 25.562793
5 0.004561 0.027057 0.038332 0.455426 54.104655 26.589209
6 0.004067 0.022784 0.038208 0.417441 38.082114 24.410237
7 0.003942 0.019639 0.028244 0.395531 30.469540 20.858209
8 0.003783 0.018679 0.028110 0.390576 27.074553 16.273837
9 0.003704 0.022365 0.025929 0.425334 28.163988 16.137306
10 0.003785 0.026266 0.019915 0.473149 29.954536 15.960582
11 0.004429 0.035189 0.012832 0.615757 47.963381 24.770641
12 0.004482 0.034329 0.011228 0.650039 49.122613 29.238891

Boxplots for Each Pollutant by Month¶

In [53]:
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='Month', y=pollutant, data=data)
    plt.title(f'Boxplot of {pollutant} Levels by Month')
    plt.ylabel('Level')
    plt.show()

Boxplots for Each Pollutant by Hour of the Day¶

In [54]:
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='Hour', y=pollutant, data=data)
    plt.title(f'Boxplot of {pollutant} Levels by Hour of the Day')
    plt.ylabel('Level')
    plt.show()
In [55]:
data = data.drop(columns=["Hour"])

Location Analysis¶

Location Station¶

In [56]:
# Location analysis: average level of each pollutant at each station
average_pollutant_per_station = data.groupby('Station name(district)')[numerical_columns].mean()

# Bar plots of the average pollutant levels at each station
fig, axes = plt.subplots(3, 2, figsize=(20, 15))

for ax, pollutant in zip(axes.flatten(), numerical_columns):
    average_pollutant_per_station[pollutant].sort_values(ascending=False).plot(kind='bar', ax=ax)
    ax.set_title(f'Average {pollutant} Levels by Station')
    ax.set_ylabel('Average Level')

plt.tight_layout()
plt.show()

average_pollutant_per_station
Out[56]:
SO2 NO2 O3 CO PM10 PM2.5
Station name(district)
Dobong-gu 0.004249 0.022366 0.027007 0.577858 43.476836 25.853363
Dongdaemun-gu 0.005399 0.030024 0.022892 0.520852 39.181641 23.230170
Dongjak-gu 0.003642 0.030059 0.025239 0.473793 41.794418 24.460256
Eunpyeong-gu 0.004256 0.024903 0.028471 0.553067 42.766211 25.158484
Gangbuk-gu 0.003178 0.021713 0.029482 0.463032 38.854965 21.739436
Gangdong-gu 0.004519 0.029935 0.021883 0.519003 45.361976 24.932772
Gangnam-gu 0.005250 0.029288 0.019895 0.465135 40.026230 24.307809
Gangseo-gu 0.005078 0.030313 0.026210 0.485949 54.801959 23.563652
Geumcheon-gu 0.003936 0.030880 0.023672 0.492682 39.982541 24.904506
Guro-gu 0.005648 0.029656 0.028276 0.411604 51.589377 30.869586
Gwanak-gu 0.004749 0.031230 0.024400 0.464334 45.973865 28.187193
Gwangjin-gu 0.004315 0.027828 0.023457 0.628364 45.839476 29.329425
Jongno-gu 0.004386 0.031628 0.024625 0.585405 38.017699 22.893540
Jung-gu 0.003591 0.032298 0.025522 0.504858 37.991307 22.877427
Jungnang-gu 0.005739 0.026040 0.023252 0.479176 38.209019 22.415185
Mapo-gu 0.003954 0.027271 0.024730 0.528877 47.771956 30.426119
Nowon-gu 0.004557 0.027053 0.024494 0.531892 40.433423 24.257778
Seocho-gu 0.004200 0.028358 0.025662 0.434477 54.040805 29.975835
Seodaemun-gu 0.004274 0.024067 0.025601 0.599368 42.530610 23.460195
Seongbuk-gu 0.003630 0.031340 0.022527 0.645122 45.934146 25.907135
Seongdong-gu 0.004412 0.028818 0.021654 0.485073 49.910950 25.529989
Songpa-gu 0.004032 0.029795 0.023431 0.569492 45.943192 24.057421
Yangcheon-gu 0.004266 0.031597 0.022208 0.524122 42.700504 25.736788
Yeongdeungpo-gu 0.004534 0.028877 0.023665 0.575928 50.562172 31.427645
Yongsan-gu 0.003516 0.029877 0.021763 0.447149 35.891700 23.877670
In [57]:
# Apply the function to each pollutant
for pollutant in conditions.keys():
    average_pollutant_per_station[f'{pollutant}_Category'] = average_pollutant_per_station[pollutant].apply(lambda x: categorize(x, conditions[pollutant]))
In [58]:
average_pollutant_per_station
Out[58]:
SO2 NO2 O3 CO PM10 PM2.5 SO2_Category NO2_Category CO_Category O3_Category PM10_Category PM2.5_Category
Station name(district)
Dobong-gu 0.004249 0.022366 0.027007 0.577858 43.476836 25.853363 Good Good Good Good Normal Normal
Dongdaemun-gu 0.005399 0.030024 0.022892 0.520852 39.181641 23.230170 Good Normal Good Good Normal Normal
Dongjak-gu 0.003642 0.030059 0.025239 0.473793 41.794418 24.460256 Good Normal Good Good Normal Normal
Eunpyeong-gu 0.004256 0.024903 0.028471 0.553067 42.766211 25.158484 Good Good Good Good Normal Normal
Gangbuk-gu 0.003178 0.021713 0.029482 0.463032 38.854965 21.739436 Good Good Good Good Normal Normal
Gangdong-gu 0.004519 0.029935 0.021883 0.519003 45.361976 24.932772 Good Good Good Good Normal Normal
Gangnam-gu 0.005250 0.029288 0.019895 0.465135 40.026230 24.307809 Good Good Good Good Normal Normal
Gangseo-gu 0.005078 0.030313 0.026210 0.485949 54.801959 23.563652 Good Normal Good Good Normal Normal
Geumcheon-gu 0.003936 0.030880 0.023672 0.492682 39.982541 24.904506 Good Normal Good Good Normal Normal
Guro-gu 0.005648 0.029656 0.028276 0.411604 51.589377 30.869586 Good Good Good Good Normal Normal
Gwanak-gu 0.004749 0.031230 0.024400 0.464334 45.973865 28.187193 Good Normal Good Good Normal Normal
Gwangjin-gu 0.004315 0.027828 0.023457 0.628364 45.839476 29.329425 Good Good Good Good Normal Normal
Jongno-gu 0.004386 0.031628 0.024625 0.585405 38.017699 22.893540 Good Normal Good Good Normal Normal
Jung-gu 0.003591 0.032298 0.025522 0.504858 37.991307 22.877427 Good Normal Good Good Normal Normal
Jungnang-gu 0.005739 0.026040 0.023252 0.479176 38.209019 22.415185 Good Good Good Good Normal Normal
Mapo-gu 0.003954 0.027271 0.024730 0.528877 47.771956 30.426119 Good Good Good Good Normal Normal
Nowon-gu 0.004557 0.027053 0.024494 0.531892 40.433423 24.257778 Good Good Good Good Normal Normal
Seocho-gu 0.004200 0.028358 0.025662 0.434477 54.040805 29.975835 Good Good Good Good Normal Normal
Seodaemun-gu 0.004274 0.024067 0.025601 0.599368 42.530610 23.460195 Good Good Good Good Normal Normal
Seongbuk-gu 0.003630 0.031340 0.022527 0.645122 45.934146 25.907135 Good Normal Good Good Normal Normal
Seongdong-gu 0.004412 0.028818 0.021654 0.485073 49.910950 25.529989 Good Good Good Good Normal Normal
Songpa-gu 0.004032 0.029795 0.023431 0.569492 45.943192 24.057421 Good Good Good Good Normal Normal
Yangcheon-gu 0.004266 0.031597 0.022208 0.524122 42.700504 25.736788 Good Normal Good Good Normal Normal
Yeongdeungpo-gu 0.004534 0.028877 0.023665 0.575928 50.562172 31.427645 Good Good Good Good Normal Normal
Yongsan-gu 0.003516 0.029877 0.021763 0.447149 35.891700 23.877670 Good Good Good Good Normal Normal

Map Station¶

In [59]:
# Function to map categories to colors
def map_color(category):
    if category == 1:
        return 'green'
    elif category == 2:
        return 'blue'
    elif category == 3:
        return 'yellow'
    else:
        return 'red'
In [60]:
# Create a map centered at an average latitude and longitude of the stations
# map_folium = folium.Map(location=[data['Latitude'].mean(), data['Longitude'].mean()], zoom_start=10)

# For each station, create a circle marker
# for idx, row in data.iterrows():
    # Convert the level of a pollutant to a category
    # category = categorize(row['PM2.5'], conditions['PM2.5'])

    # Convert the category to a color
    # color = map_color(category)

    # Adjust the size of the marker based on the level of the pollutant
    # size = row['PM2.5'] / 10

    # folium.CircleMarker(location=[row['Latitude'], row['Longitude']], radius=size, color=color, fill=True).add_to(map_folium)

# Show the map
# map_folium

# THE MAP NOT SHOWING IN COLAB
In [61]:
fig = px.scatter_geo(data_merged, lat='Latitude', lon='Longitude', color='Quality')
fig.show()

# THE MAP NOT SHOWING IN COLAB

Other Analysis¶

Bivariate Analysis¶

In [62]:
# Scatter plots between pairs of pollutants
fig, axes = plt.subplots(3, 2, figsize=(15, 10))

for ax, (pollutant1, pollutant2) in zip(axes.flatten(), [('SO2', 'NO2'), ('SO2', 'O3'), ('NO2', 'O3'), ('PM10', 'PM2.5'), ('SO2', 'PM2.5'), ('NO2', 'PM10')]):
    sns.scatterplot(data=data.sample(frac=0.01, random_state=1), x=pollutant1, y=pollutant2, ax=ax)
    ax.set_title(f'{pollutant1} vs {pollutant2}')

plt.tight_layout()
plt.show()

Distribution Analysis¶

In [63]:
# Visualize the distribution of each pollutant using histograms and QQ plots
fig, axes = plt.subplots(6, 2, figsize=(15, 20))

for i, pollutant in enumerate(numerical_columns):
    # Histogram
    sns.histplot(data=data, x=pollutant, kde=True, ax=axes[i, 0])
    axes[i, 0].set_title(f'Histogram of {pollutant}')

    # QQ plot
    stats.probplot(data[pollutant], plot=axes[i, 1])
    axes[i, 1].set_title(f'QQ Plot of {pollutant}')

plt.tight_layout()
plt.show()

Pairplots¶

In [64]:
# Due to the large size of the dataset, Sample a subset of the data for the pairplot
sample_data = data.sample(frac=0.1, random_state=1)
sns.pairplot(sample_data[numerical_columns])

plt.show()

Resample Data¶

In [65]:
# data.set_index('Measurement date', inplace=True)

# Resample to daily frequency
data_daily = data.resample('D').mean()

# Resample to Monthly frequency
data_Month = data.resample('M').mean()
C:\Users\Fajri\AppData\Local\Temp\ipykernel_14984\3108603987.py:4: FutureWarning:

The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

C:\Users\Fajri\AppData\Local\Temp\ipykernel_14984\3108603987.py:7: FutureWarning:

The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

Prediction Models¶

In [66]:
# Prepare sequence data
def create_sequences(data, seq_length):
    Xs, ys = [], []
    for i in range(len(data)-seq_length):
        Xs.append(data[i:(i+seq_length)])
        ys.append(data[i+seq_length])
    return np.array(Xs), np.array(ys)

# Select the pollutants and convert to numpy array
pollutants = ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']
data_pollutants = data[pollutants].values

# Scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data_pollutants)

# Choose sequence length
seq_length = 5

# Create sequences
X, y = create_sequences(data_scaled, seq_length)

# Split the data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=False)

Tambahan Tidak dipakai¶

In [67]:
# data = data.drop(columns=["level_0","index"])
data = data.reset_index()
In [68]:
# Melt the pollutants and their categories into single columns
data_melted = data.melt(id_vars=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude','Year','Month','Quarter'],
                        value_vars=['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5'],
                        var_name='Air Pollutants', value_name='Measurement Value')

data_melted_categories = data.melt(id_vars=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude','Year','Month','Quarter'],
                                   value_vars=['SO2_Category', 'NO2_Category', 'CO_Category', 'O3_Category', 'PM10_Category', 'PM2.5_Category'],
                                   var_name='Air Pollutants', value_name='Quality')
In [69]:
# Remove the "_Category" part from the 'Air Pollutants' column in the categories dataframe
data_melted_categories['Air Pollutants'] = data_melted_categories['Air Pollutants'].str.replace('_Category', '')
In [70]:
# Merge the two dataframes on the common columns
data_merged = pd.merge(data_melted, data_melted_categories,
                       on=['Measurement date', 'Station code', 'Station name(district)', 'Address', 'Latitude', 'Longitude','Year','Month','Quarter','Air Pollutants'])
In [71]:
data_merged.head(5)
Out[71]:
Measurement date Station code Station name(district) Address Latitude Longitude Year Month Quarter Air Pollutants Measurement Value Quality
0 2017-01-01 00:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 2017 1 1 SO2 0.004 1
1 2017-01-01 01:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 2017 1 1 SO2 0.004 1
2 2017-01-01 02:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 2017 1 1 SO2 0.004 1
3 2017-01-01 03:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 2017 1 1 SO2 0.004 1
4 2017-01-01 04:00:00 101 Jongno-gu 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... 37.572016 127.005008 2017 1 1 SO2 0.003 1
In [72]:
# Create a dictionary to map numbers to categories
numeric_to_category = {1 : 'Good', 2 : 'Normal', 3 : 'Bad', 4 : 'Very Bad'}

# Changing categories to be numeric for each category column
for pollutant in ['Quality']:
    data_merged[f'{pollutant}'] = data_merged[f'{pollutant}'].map(numeric_to_category)
In [73]:
data_merged.sample(5)
Out[73]:
Measurement date Station code Station name(district) Address Latitude Longitude Year Month Quarter Air Pollutants Measurement Value Quality
960639 2017-04-08 03:00:00 113 Dobong-gu 34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub... 37.654192 127.029088 2017 4 2 NO2 0.037 Normal
1870534 2017-08-27 02:00:00 123 Gangnam-gu 426, Hakdong-ro, Gangnam-gu, Seoul, Republic o... 37.517528 127.047470 2017 8 3 CO 0.400 Good
627508 2017-09-03 21:00:00 125 Gangdong-gu 59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul... 37.544962 127.136792 2017 9 3 SO2 0.004 Good
456870 2018-11-22 14:00:00 118 Geumcheon-gu 20, Geumha-ro 21-gil, Geumcheon-gu, Seoul, Rep... 37.452357 126.908296 2018 11 4 SO2 0.004 Good
49679 2019-10-03 17:00:00 102 Jung-gu 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... 37.564263 126.974676 2019 10 4 SO2 0.003 Good
In [74]:
# data_merged.to_csv("Measurement_summary_fix_pisan_bangets.csv",index=False)
In [ ]: